{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['/home/magupta/anaconda3/envs/stackoverflow_env/RedHat', '/usr/lib64/python37.zip', '/usr/lib64/python3.7', '/usr/lib64/python3.7/lib-dynload', '', '/home/magupta/.local/lib/python3.7/site-packages', '/usr/local/lib64/python3.7/site-packages', '/usr/local/lib/python3.7/site-packages', '/usr/lib64/python3.7/site-packages', '/usr/lib/python3.7/site-packages', '/usr/local/lib/python3.7/site-packages/IPython/extensions', '/home/magupta/.ipython']\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import sys\n",
    "print(sys.path)\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from math import sqrt\n",
    "from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.model_selection import cross_val_score, GridSearchCV\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from numpy import argmax\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "from collections import Counter\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from xgboost import XGBClassifier\n",
    "import os\n",
    "os.environ['KMP_DUPLICATE_LIB_OK']='True'\n",
    "from xgboost import XGBClassifier\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib64/python3.7/site-packages/numpy/lib/arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
      "  mask |= (ar1 == a)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(1264212, 27)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_full=pd.read_csv(\"data_stackOverflow/final_dataframe_with_1tag_onehot.csv\",index_col=0)\n",
    "df_full.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_col = [col for col in df_full if col.startswith('tag_')]\n",
    "df_full.drop(label_col,inplace=True,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ques_title</th>\n",
       "      <th>ques_body</th>\n",
       "      <th>ques_score_bad</th>\n",
       "      <th>ques_score_medium</th>\n",
       "      <th>ques_score_good</th>\n",
       "      <th>maintag</th>\n",
       "      <th>all_tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sqlstatement execut multipl queri one statement</td>\n",
       "      <td>i written databas generat script href http en ...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>other</td>\n",
       "      <td>other/other/other</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>good branch merg tutori tortoisesvn</td>\n",
       "      <td>realli good tutori explain href http svnbook r...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>other</td>\n",
       "      <td>other/other/other/other</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        ques_title  \\\n",
       "0  sqlstatement execut multipl queri one statement   \n",
       "1              good branch merg tutori tortoisesvn   \n",
       "\n",
       "                                           ques_body  ques_score_bad  \\\n",
       "0  i written databas generat script href http en ...               0   \n",
       "1  realli good tutori explain href http svnbook r...               0   \n",
       "\n",
       "   ques_score_medium  ques_score_good maintag                 all_tags  \n",
       "0                  0                1   other        other/other/other  \n",
       "1                  0                1   other  other/other/other/other  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_full.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "other            424440\n",
       "javascript       118826\n",
       "java             114208\n",
       "c#               100192\n",
       "php               92015\n",
       "android           71962\n",
       "python            62622\n",
       "c++               45318\n",
       "ios               36617\n",
       "jquery            31429\n",
       "ruby-on-rails     22556\n",
       "html              22276\n",
       "sql               22094\n",
       "mysql             20254\n",
       "c                 18987\n",
       "iphone            17165\n",
       "asp.net           13329\n",
       "css               11288\n",
       "objective-c       10055\n",
       ".net               8546\n",
       "Name: maintag, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_full[\"maintag\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ques_title</th>\n",
       "      <th>ques_body</th>\n",
       "      <th>ques_score_bad</th>\n",
       "      <th>ques_score_medium</th>\n",
       "      <th>ques_score_good</th>\n",
       "      <th>maintag</th>\n",
       "      <th>all_tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>33127</th>\n",
       "      <td>jsp portlet submit form valu</td>\n",
       "      <td>use liferay portal x deploy simpl portlet use ...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>nan/other/other/other</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154359</th>\n",
       "      <td>sitecor context item null postback</td>\n",
       "      <td>inherit sitecor implement tri debug membership...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>nan/other/other/other</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                ques_title  \\\n",
       "33127         jsp portlet submit form valu   \n",
       "154359  sitecor context item null postback   \n",
       "\n",
       "                                                ques_body  ques_score_bad  \\\n",
       "33127   use liferay portal x deploy simpl portlet use ...               0   \n",
       "154359  inherit sitecor implement tri debug membership...               0   \n",
       "\n",
       "        ques_score_medium  ques_score_good maintag               all_tags  \n",
       "33127                   1                0     NaN  nan/other/other/other  \n",
       "154359                  0                1     NaN  nan/other/other/other  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show two rows whose all_tags value is exactly 'nan/other/other/other'.\n",
    "df_full[df_full['all_tags'] == \"nan/other/other/other\"].head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5.79542197036573, 22, 1, 136.90592954346263, 8469, 1)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c=df_full['ques_title'].apply(lambda x: len(str(x).split()))\n",
    "d=df_full['ques_body'].apply(lambda x: len(str(x).split()))\n",
    "c.mean(),c.max(),c.min(),d.mean(),d.max(),d.min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join title and body with a single space so the last title token and the first\n",
    "# body token stay separate words for the tokenizer (plain '+' fused them, e.g.\n",
    "# 'tortoisesvn' + 'realli' -> 'tortoisesvnrealli'). A missing title or body is\n",
    "# treated as empty text instead of turning the whole row's text into NaN.\n",
    "df_full['text'] = df_full['ques_title'].fillna('') + ' ' + df_full['ques_body'].fillna('')\n",
    "df_full.drop(['ques_title','ques_body'],inplace=True,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1264179, 6)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_full = df_full[df_full.maintag.notnull()]\n",
    "df_full.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([15, 19,  4,  5,  0,  7,  3,  2, 16, 10, 11,  6, 14, 13, 17, 18, 12,\n",
       "        1,  9,  8])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label_encoder = LabelEncoder() \n",
    "df_full['maintag']= label_encoder.fit_transform(df_full['maintag'].astype(str)) \n",
    "df_full['maintag'].unique() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',\n",
       "       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',\n",
       "       'objective-c', 'other', 'php', 'python', 'ruby-on-rails', 'sql'],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label_encoder.classes_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'.net': 0,\n",
       " 'android': 1,\n",
       " 'asp.net': 2,\n",
       " 'c': 3,\n",
       " 'c#': 4,\n",
       " 'c++': 5,\n",
       " 'css': 6,\n",
       " 'html': 7,\n",
       " 'ios': 8,\n",
       " 'iphone': 9,\n",
       " 'java': 10,\n",
       " 'javascript': 11,\n",
       " 'jquery': 12,\n",
       " 'mysql': 13,\n",
       " 'objective-c': 14,\n",
       " 'other': 15,\n",
       " 'php': 16,\n",
       " 'python': 17,\n",
       " 'ruby-on-rails': 18,\n",
       " 'sql': 19}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))\n",
    "le_name_mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ques_score_bad</th>\n",
       "      <th>ques_score_medium</th>\n",
       "      <th>ques_score_good</th>\n",
       "      <th>maintag</th>\n",
       "      <th>all_tags</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>15</td>\n",
       "      <td>other/other/other</td>\n",
       "      <td>sqlstatement execut multipl queri one statemen...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>15</td>\n",
       "      <td>other/other/other/other</td>\n",
       "      <td>good branch merg tutori tortoisesvnrealli good...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ques_score_bad  ques_score_medium  ques_score_good  maintag  \\\n",
       "0               0                  0                1       15   \n",
       "1               0                  0                1       15   \n",
       "\n",
       "                  all_tags                                               text  \n",
       "0        other/other/other  sqlstatement execut multipl queri one statemen...  \n",
       "1  other/other/other/other  good branch merg tutori tortoisesvnrealli good...  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_full.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "score_col = [col for col in df_full if col.startswith('ques_score_')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Applying TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tf_idf(df,flag,tfidf_text):\n",
    "    \"\"\"Turn the 'text' column of `df` into a dense matrix using `tfidf_text`.\n",
    "\n",
    "    df         -- frame with a 'text' column; its values are cast to unicode strings.\n",
    "    flag       -- 'train' calls fit_transform() on the column; any other value\n",
    "                  only calls transform().\n",
    "    tfidf_text -- vectorizer object exposing fit_transform() and transform().\n",
    "\n",
    "    Returns the matrix as a list of rows (each row itself a list of values).\n",
    "    \"\"\"\n",
    "    documents = df['text'].values.astype('U')\n",
    "    vectorise = tfidf_text.fit_transform if flag=='train' else tfidf_text.transform\n",
    "    return vectorise(documents).toarray().tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Taking 5k samples from every class present in original dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_size=5000\n",
    "# Draw the same number of rows from every class. A fixed random_state makes the\n",
    "# draw repeatable on a fresh kernel; group_keys=False keeps the original row\n",
    "# index instead of adding 'maintag' as an extra index level next to the\n",
    "# 'maintag' column. groupby.apply already returns a DataFrame, so no wrapper.\n",
    "df_equal_samples=df_full.groupby('maintag', group_keys=False).apply(lambda x: x.sample(sample_size, random_state=50))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 80/10/10 split. A fixed seed makes the partition repeatable, and stratifying on\n",
    "# the label keeps every class at the same share in train, validation and test.\n",
    "equal_train, equal_val_test = train_test_split(df_equal_samples, test_size=0.2, random_state=50, stratify=df_equal_samples['maintag'])\n",
    "equal_val,equal_test=train_test_split(equal_val_test, test_size=0.5, random_state=50, stratify=equal_val_test['maintag'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((100000, 6), (80000, 6), (10000, 6), (10000, 6))"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_equal_samples.shape,equal_train.shape,equal_val.shape,equal_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features: every column except the two tag columns. Target: the 'maintag' column.\n",
    "equal_x_train, equal_x_val, equal_x_test = [\n",
    "    part.drop(columns=['all_tags','maintag'])\n",
    "    for part in (equal_train, equal_val, equal_test)\n",
    "]\n",
    "equal_y_train, equal_y_val, equal_y_test = [\n",
    "    part['maintag'] for part in (equal_train, equal_val, equal_test)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((80000, 4), (80000,), (10000, 4), (10000,), (10000, 4), (10000,))"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "equal_x_train.shape,equal_y_train.shape,equal_x_val.shape,equal_y_val.shape,equal_x_test.shape,equal_y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['ques_score_bad', 'ques_score_medium', 'ques_score_good', 'text'], dtype='object')"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "equal_x_train.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word 1- to 3-gram vectorizer with a vocabulary capped at 300 terms.\n",
    "# (Original note kept: max_df=0.9, min_df=0.1)\n",
    "equal_tfidf_text = TfidfVectorizer(\n",
    "    lowercase=True,\n",
    "    ngram_range=(1,3),\n",
    "    max_features=300,\n",
    ")\n",
    "\n",
    "# 'train' fits the vectorizer; the other two calls only transform with it.\n",
    "equal_train_tfidf = pd.DataFrame(tf_idf(equal_x_train,'train',equal_tfidf_text))\n",
    "equal_val_tfidf = pd.DataFrame(tf_idf(equal_x_val,'val',equal_tfidf_text))\n",
    "equal_test_tfidf = pd.DataFrame(tf_idf(equal_x_test,'test',equal_tfidf_text))\n",
    "\n",
    "# Append the ques_score_* columns to the right of the TF-IDF columns.\n",
    "equal_train_features = pd.DataFrame(np.column_stack((equal_train_tfidf.values, equal_x_train[score_col].values)))\n",
    "equal_val_features = pd.DataFrame(np.column_stack((equal_val_tfidf.values, equal_x_val[score_col].values)))\n",
    "equal_test_features = pd.DataFrame(np.column_stack((equal_test_tfidf.values, equal_x_test[score_col].values)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'equal_tfidf_text' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-43-5baa4b74a860>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mequal_tfidf_text\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocabulary_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'equal_tfidf_text' is not defined"
     ]
    }
   ],
   "source": [
    "len(equal_tfidf_text.vocabulary_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(80000, 300) (10000, 300) (10000, 300)\n",
      "(80000, 303) (10000, 303) (10000, 303)\n"
     ]
    }
   ],
   "source": [
    "print(equal_train_tfidf.shape,equal_val_tfidf.shape,equal_test_tfidf.shape)\n",
    "print(equal_train_features.shape,equal_val_features.shape,equal_test_features.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "equal_text_feature_names = np.array(equal_tfidf_text.get_feature_names())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Applying Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# oob_score is a boolean flag: the string 'TRUE' only worked because any non-empty\n",
    "# string is truthy, and newer scikit-learn releases reject it during parameter\n",
    "# validation. max_features=\"sqrt\" is the explicit spelling of what \"auto\" meant\n",
    "# for classifiers, so the fitted forest is unchanged.\n",
    "model_1 = RandomForestClassifier(n_estimators=200, oob_score=True, n_jobs=-1, random_state=50, max_features=\"sqrt\",min_samples_leaf=1)\n",
    "model_1.fit(equal_train_features, equal_y_train)\n",
    "equal_y_pred = model_1.predict(equal_test_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy of Random Forest: 0.5758\n"
     ]
    }
   ],
   "source": [
    "# accuracy_score expects (y_true, y_pred); the value is the same either way, but\n",
    "# this order matches the confusion_matrix and classification_report calls.\n",
    "print(\"accuracy of Random Forest:\",accuracy_score(equal_y_test,equal_y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[250   2  42   9  53  20   3   6   5  19  10  10   5   3  18  21   1   3\n",
      "    2  26]\n",
      " [  5 382   1   1   6   3   1   3   5  33  15   1   0   2  12   5   2   2\n",
      "    0   0]\n",
      " [ 27   0 311   1  33   2  12   7   3  18   3  10  11   3   3  12   1   2\n",
      "    1  15]\n",
      " [  9   0   2 361   4  49   0   2   4   9   7   3   0   1  12  26   6  11\n",
      "    1   5]\n",
      " [ 66   5  80  16 159  21   3  11  10  19  20   8   4   2  16  22   3   6\n",
      "    2  15]\n",
      " [ 13   3   0 108   9 272   1   2   6  14   4   5   0   1  17  28   4   8\n",
      "    0   0]\n",
      " [  5   2   3   3   0   0 324 143   5  13   1   8  17   0   2   3   5   0\n",
      "    7   0]\n",
      " [  2   0  12   0   3   1 144 258   3  10   2  18  17   1   1   6  10   5\n",
      "   10   3]\n",
      " [  4   4   5   8   3  11   1   1 265 100   4  11   1   1  91  11   2   6\n",
      "    5   1]\n",
      " [  9   7   2  17   5  10   8   6 102 205   3   3   4   2  87  17   3   4\n",
      "    1   9]\n",
      " [ 13  62   3  14  31  10   2   7   1  12 278   2   0   5   9  20   3   7\n",
      "    3   7]\n",
      " [  1  13   8   7   8   4  15  49  11   8   5 200 138   0  12   9  25   5\n",
      "    6   5]\n",
      " [  1   4  12   0   0   1  16  24   2   4   1  52 352   2   0   3   4   0\n",
      "    5   3]\n",
      " [  3   1   5   1   0   0   0   0   0   0   6   0   2 319   0   5  23   2\n",
      "    9 104]\n",
      " [ 10   1   2  18  10  19   1   6  96  82   5   5   0   0 189  18   3   8\n",
      "    3   7]\n",
      " [ 32  12  22  36  32  28  10  19  22  39  24  35   8   6  13  81  18  40\n",
      "   15  26]\n",
      " [  5   4   3   2   6   6   2  11   1   4   0  13  16  35   5  18 359   3\n",
      "    4  14]\n",
      " [  6   2   1  12   3   7   1   5   7   9   2   7   1   1  11  12   4 391\n",
      "    5  10]\n",
      " [  2   2   2   1   1   0   4   5   2   8   2   6   2   6   6   9   4   1\n",
      "  408  12]\n",
      " [  3   0   6   2   4   1   1   2   0   1   4   1   0  38   4   7   6   3\n",
      "    4 394]]\n"
     ]
    }
   ],
   "source": [
    "conf_mat = confusion_matrix(equal_y_test, equal_y_pred)\n",
    "print(conf_mat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.54      0.49      0.51       508\n",
      "           1       0.75      0.80      0.78       479\n",
      "           2       0.60      0.65      0.62       475\n",
      "           3       0.59      0.71      0.64       512\n",
      "           4       0.43      0.33      0.37       488\n",
      "           5       0.58      0.55      0.57       495\n",
      "           6       0.59      0.60      0.59       541\n",
      "           7       0.46      0.51      0.48       506\n",
      "           8       0.48      0.50      0.49       535\n",
      "           9       0.34      0.41      0.37       504\n",
      "          10       0.70      0.57      0.63       489\n",
      "          11       0.50      0.38      0.43       529\n",
      "          12       0.61      0.72      0.66       486\n",
      "          13       0.75      0.66      0.70       480\n",
      "          14       0.37      0.39      0.38       483\n",
      "          15       0.24      0.16      0.19       518\n",
      "          16       0.74      0.70      0.72       511\n",
      "          17       0.77      0.79      0.78       497\n",
      "          18       0.83      0.84      0.84       483\n",
      "          19       0.60      0.82      0.69       481\n",
      "\n",
      "    accuracy                           0.58     10000\n",
      "   macro avg       0.57      0.58      0.57     10000\n",
      "weighted avg       0.57      0.58      0.57     10000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "report=classification_report(equal_y_test, equal_y_pred)\n",
    "print(report)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Taking a random 100k sample from the original dataframe, so the sample keeps roughly the original (imbalanced) class distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15    33753\n",
       "11     9509\n",
       "10     8959\n",
       "4      7897\n",
       "16     7227\n",
       "1      5759\n",
       "17     5078\n",
       "5      3527\n",
       "8      2887\n",
       "12     2375\n",
       "7      1784\n",
       "18     1752\n",
       "19     1656\n",
       "13     1614\n",
       "3      1502\n",
       "9      1367\n",
       "2      1040\n",
       "6       824\n",
       "14      803\n",
       "0       687\n",
       "Name: maintag, dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fixed seed so the same 100k rows are drawn on every run.\n",
    "df=df_full.sample(n=100000, random_state=50)\n",
    "df['maintag'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(\"sample_data.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 80/10/10 split. The classes are heavily imbalanced, so stratify on the label to\n",
    "# keep each class's share identical across train, validation and test; the fixed\n",
    "# seed makes the partition repeatable.\n",
    "train, val_test = train_test_split(df, test_size=0.2, random_state=50, stratify=df['maintag'])\n",
    "val,test=train_test_split(val_test, test_size=0.5, random_state=50, stratify=val_test['maintag'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((100000, 6), (80000, 6), (10000, 6), (10000, 6))"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape,train.shape,val.shape,test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['ques_score_bad', 'ques_score_medium', 'ques_score_good', 'maintag',\n",
       "       'all_tags', 'text'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features: every column except the two tag columns. Target: the 'maintag' column.\n",
    "x_train, x_val, x_test = [\n",
    "    part.drop(columns=['all_tags','maintag'])\n",
    "    for part in (train, val, test)\n",
    "]\n",
    "y_train, y_val, y_test = [\n",
    "    part['maintag'] for part in (train, val, test)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((80000, 4), (80000,), (10000, 4), (10000,), (10000, 4), (10000,))"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape,y_train.shape,x_val.shape,y_val.shape,x_test.shape,y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word 1- to 3-gram vectorizer with a vocabulary capped at 500 terms.\n",
    "# (Original note kept: max_df=0.9, min_df=0.1)\n",
    "tfidf_text = TfidfVectorizer(\n",
    "    lowercase=True,\n",
    "    ngram_range=(1,3),\n",
    "    max_features=500,\n",
    ")\n",
    "\n",
    "# 'train' fits the vectorizer; the other two calls only transform with it.\n",
    "train_tfidf = pd.DataFrame(tf_idf(x_train,'train',tfidf_text))\n",
    "val_tfidf = pd.DataFrame(tf_idf(x_val,'val',tfidf_text))\n",
    "test_tfidf = pd.DataFrame(tf_idf(x_test,'test',tfidf_text))\n",
    "\n",
    "# Append the ques_score_* columns to the right of the TF-IDF columns.\n",
    "train_features = pd.DataFrame(np.column_stack((train_tfidf.values, x_train[score_col].values)))\n",
    "val_features = pd.DataFrame(np.column_stack((val_tfidf.values, x_val[score_col].values)))\n",
    "test_features = pd.DataFrame(np.column_stack((test_tfidf.values, x_test[score_col].values)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "500"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tfidf_text.vocabulary_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(80000, 500) (10000, 500) (10000, 500)\n",
      "(80000, 503) (10000, 503) (10000, 503)\n"
     ]
    }
   ],
   "source": [
    "print(train_tfidf.shape,val_tfidf.shape,test_tfidf.shape)\n",
    "print(train_features.shape,val_features.shape,test_features.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_feature_names = np.array(tfidf_text.get_feature_names())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# oob_score is a boolean flag: the string 'TRUE' only worked because any non-empty\n",
    "# string is truthy, and newer scikit-learn releases reject it during parameter\n",
    "# validation. max_features=\"sqrt\" is the explicit spelling of what \"auto\" meant\n",
    "# for classifiers, so the fitted forest is unchanged.\n",
    "model_2 = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1, random_state=50, max_features=\"sqrt\",min_samples_leaf=1)\n",
    "model_2.fit(train_features, y_train)\n",
    "y_pred = model_2.predict(test_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy of Random Forest: 0.6069\n"
     ]
    }
   ],
   "source": [
    "# accuracy_score expects (y_true, y_pred); the value is the same either way.\n",
    "print(\"accuracy of Random Forest:\",accuracy_score(y_test,y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[   0    0    0    0   16    0    1    0    0    0    1    1    0    0\n",
      "     0   51    0    0    0    0]\n",
      " [   0  479    0    0    3    0    0    1    0    0   40    5    0    0\n",
      "     0   93    1    1    0    0]\n",
      " [   0    0   14    0   23    0    0    2    0    0    0    6    0    1\n",
      "     0   59    1    0    0    1]\n",
      " [   0    0    0   69    2   14    0    0    0    0    2    0    0    0\n",
      "     0   73    1    3    0    0]\n",
      " [   0    1   12    0  316    3    0    1    0    0   23   20    3    4\n",
      "     0  444    6    3    0    1]\n",
      " [   0    2    0   14    5  142    0    0    0    0    2    1    0    0\n",
      "     0  161    0    3    0    0]\n",
      " [   0    0    0    0    0    1   12   21    0    0    0    9    0    0\n",
      "     0   34    0    0    0    0]\n",
      " [   0    0    0    0    1    0    5   86    0    0    0   21    0    0\n",
      "     0   71    9    0    1    0]\n",
      " [   0    3    0    1    1    0    0    0  106    0    0    1    0    0\n",
      "     0  147    2    1    0    0]\n",
      " [   0    1    0    1    0    2    0    0   30    0    1    3    0    0\n",
      "     0   85    2    0    0    0]\n",
      " [   0   90    1    2   22    0    0    3    2    0  532    6    0    1\n",
      "     0  251    6    3    0    0]\n",
      " [   0    2    1    1    1    0    2   11    2    0    3  630   26    0\n",
      "     0  209   33    1    2    0]\n",
      " [   0    0    0    0    2    0    2    5    0    0    3  132   51    0\n",
      "     0   31    9    1    0    0]\n",
      " [   0    0    0    0    1    0    0    0    0    0    6    0    0   91\n",
      "     0   41   13    0    0   16]\n",
      " [   0    0    0    0    0    2    0    0   15    0    0    0    0    0\n",
      "     0   48    0    0    0    0]\n",
      " [   0   40    7    9   89   18    1    5   31    1  132  163    9   16\n",
      "     0 2612  107   60    7   21]\n",
      " [   0    1    0    1    0    0    0    2    0    0    3   40    2   19\n",
      "     0  131  509    0    0    9]\n",
      " [   0    2    0    0    0    0    0    0    2    0    1    2    0    2\n",
      "     0  137    4  340    0    1]\n",
      " [   0    0    0    0    0    0    0    1    1    0    1    8    0    1\n",
      "     0  142    2    2   31    0]\n",
      " [   0    0    0    0    0    0    0    0    0    0    3    0    0    7\n",
      "     0  109    5    0    1   49]]\n"
     ]
    }
   ],
   "source": [
    "# Confusion matrix of the current predictions against the test labels\n",
    "# (scikit-learn convention: rows are true labels, columns are predicted labels).\n",
    "conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)\n",
    "print(conf_mat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.00      0.00      0.00        70\n",
      "           1       0.77      0.77      0.77       623\n",
      "           2       0.40      0.13      0.20       107\n",
      "           3       0.70      0.42      0.53       164\n",
      "           4       0.66      0.38      0.48       837\n",
      "           5       0.78      0.43      0.55       330\n",
      "           6       0.52      0.16      0.24        77\n",
      "           7       0.62      0.44      0.52       194\n",
      "           8       0.56      0.40      0.47       262\n",
      "           9       0.00      0.00      0.00       125\n",
      "          10       0.71      0.58      0.64       919\n",
      "          11       0.60      0.68      0.64       924\n",
      "          12       0.56      0.22      0.31       236\n",
      "          13       0.64      0.54      0.59       168\n",
      "          14       0.00      0.00      0.00        65\n",
      "          15       0.53      0.78      0.63      3328\n",
      "          16       0.72      0.71      0.71       717\n",
      "          17       0.81      0.69      0.75       491\n",
      "          18       0.74      0.16      0.27       189\n",
      "          19       0.50      0.28      0.36       174\n",
      "\n",
      "    accuracy                           0.61     10000\n",
      "   macro avg       0.54      0.39      0.43     10000\n",
      "weighted avg       0.61      0.61      0.59     10000\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib64/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "# Per-class precision / recall / F1 on the test split.\n",
    "# Some classes receive no predictions at all, which leaves their precision undefined.\n",
    "# zero_division=0 reports 0.0 for them (the same numbers as the default) without emitting\n",
    "# UndefinedMetricWarning.\n",
    "report=classification_report(y_test, y_pred, zero_division=0)\n",
    "print(report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5, 0, 'Relative Importance')"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEWCAYAAACdaNcBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAgAElEQVR4nO3deZhdZZXv8e9iCCETSQhDgECYFVEREUFRsOXaCig8aiuoKLe1vYit7WPTXr2iguLYV722PlfbFkVFQMEWbRQVL4qKKBIgDAJCMAxhEAiQoZKQYd0/9nuqDrvWe/JW1dk1nPp9nqeeOrXOe/bZOwdWrXr3u9c2d0dERHrLFmO9AyIi0n1K7iIiPUjJXUSkBym5i4j0ICV3EZEepOQuItKDlNxFRHqQkrsUM7OlZrbGzFa1fe3ShW0e3a19LHi/M83svNF6v07M7BQz++1Y74f0JiV3GapXuvuMtq/7x3JnzGyrsXz/4Zqo+y0Th5K7jJiZbWdm55jZA2a2zMzONrMt03N7m9kVZvaomT1iZt8xs9npuW8DuwP/lf4KeJ+ZHWVm99W231/dp8r7YjM7z8xWAKd0ev+CfXczO83M7jCzlWb2sbTPvzOzFWb2PTObksYeZWb3mdn/Ssey1MzeWPt3+JaZPWxmd5vZGWa2RXruFDO7ysw+b2aPAt8FvgIcno798TTuWDO7Pr33vWZ2Ztv2F6b9fYuZ3ZP24YNtz2+Z9m1JOpZFZrYgPfc0M7vczJab2e1m9rq21x1jZn9Kr1lmZqcXf/gybim5SzecC2wA9gGeA7wMeFt6zoBPArsATwcWAGcCuPvJwD0M/DXwmcL3Ox64GJgNfGcz71/ib4HnAocB7wO+Crwp7euBwEltY3cG5gG7Am8Bvmpm+6fnvghsB+wFHAm8Gfjvba99PnAXsFPa/qnA1enYZ6cxq9PrZgPHAu8wsxNq+3sEsD/wUuDDZvb0FH9v2tdjgFnA3wN9ZjYduBw4H9gROBH4v2Z2QHrdOcD/cPeZ6XivKPpXk3FNyV2G6hIzezx9XWJmO1Elk/e4+2p3/yvweaoEgrvf6e6Xu/s6d38Y+BxV4huJq939EnffRJXEsu9f6DPuvsLdbwFuBn7u7ne5+xPAZVS/MNp9KB3PlcCPgdelvxROBD7g7ivdfSnwWeDkttfd7+5fdPcN7r4m2hF3/5W73+Tum9z9RuACBv97neXua9x9MbAYeHaKvw04w91v98pid38UOA5Y6u7fSO99PfB94O/S69YDB5jZLHd/zN2vG8K/nYxTmveToTrB3X/R+sHMDgW2Bh4ws1Z4C+De9PxOwBeAFwEz03OPjXAf7m17vEen9y/0UNvjNcHPO7f9/Ji7r277+W6qv0rmpf24u/bcrpn9DpnZ84FPUVXQU4BtgItqwx5se9wHzEiPFwBLgs3uATy/NfWTbAV8Oz1+DXAG8CkzuxF4v7tfvbl9lfFNlbuM1L3AOmCeu89OX7Pc/Rnp+U8ADjzT3WdRTUdY2+vrbUlXA9NaP6SKeIfamPbXbO79u21OmuZo2R24H3iEqgLeo/bcssx+Rz9DNXXyI2CBu29HNS9vwbjIvcDemfiVbf8+s9NU0DsA3P2P7n481ZTNJcD3Ct9PxjEldxkRd38A+DnwWTObZWZbpBOSramEmcAq4Akz2xX4l9omHqKao275MzA1nVjcmqqi3GYE79+Es8xsipm9iGrK4yJ330iVFD9uZjPNbA+qOfBOyy4fAnZrnbBNZgLL3X1t+qvoDUPYr68BHzOzfa3yLDPbHrgU2M/MTjazrdPX88zs6ek43mhm27n7emAFsGkI7ynjlJK7dMObqaYQ/kQ15XIxMD89dxZwMPAE1fz0f9Ze+0ngjDSHf3qa5z6NKlEto6rk76OzTu/fbQ+m97if6mTuqe5+W3ruXVT7exfwW6oq/OsdtnUFcAvwoJk9kmKnAR81
s5XAhxlaFf25NP7nVEn6HGBbd19JdZL5xLTfDwKfZuCX5snA0rT66FTgjciEZ7pZh0gZMzsKOM/ddxvrfRHZHFXuIiI9SMldRKQHaVpGRKQHqXIXEelB4+Iipnnz5vnChQvHejdERCaURYsWPeLu9etAgHGS3BcuXMi111471rshIjKhmNnduec0LSMi0oOU3EVEepCSu4hID1JyFxHpQUruIiI9aLPJ3cymmtk1ZrbYzG4xs7NS/JwUu9Gq257NSPFT0m3GbkhfQ7kjjoiIdEHJUsjdgKdRdd0D+Cczu4yqjev+VJ3lDgSmA6+gumuNUd3gYCeqW5d9rbu7LSIinZRMy+ybvrduGLAdcDTwWqo7z6xLz+2Unp9BlehbP7/VzA7qyt6KiEiRksr97VR3jNnEQPV+AlW/bGOgJ3QrgU8Hpra93t39hvpGzeztadvA7ljpvWZERHpEk629SpL754GPA1czUOlvZCDhA2wJYGaHU1X07czM5qc75rR7EZDu2r5qqPstIhOUehWOjpLkfhvwAE+dwtmCKsFvxcB0zXpgKVWib318reeembbRbl/6K/y1Q9pp6R36H12kGSVz7vMZXFovoJpvb59M2TrFSfH2504LtnsHVVZf+9RZHJlI3Ef2JSLNKKncrwBm1WKrg3FGdT/MyEVBrL9ynzZtLaujLYqIyLCUJPc1QSxsMQkcnonfEsTuoJquoa9v6jSdUB1dqppFeltJcr8H2LUWm54Zey+wXxB/KIipchcRaUhJcjfgSQaWQbZikScy8WOAc3Jv0NeHlkJuhiptERmKkhOq9zE4medSzZOZ+MYgphOqiU48iki3lST326lWwrR7MDM2Nxc/I4i1pmWmTualkErcItKEkuT+4iAWJWuAaZl47oTqpK3cVZGLSJNKknu0AiY3Q35PJr5PEFPlLiLSkJLkviGI5WrOP2fiUXKfdJW75tBFZLSUrJZZz+Dsm5t+yXV/vDmI9Xz7ASVxERkrJZV7dPJ0y8zYlZl41BmsJyt3VeciMh6UJPdb2x63UlaujeP5mfi1Qax/zn3atLUj7lEyXr5ERMaDkmmZZ7U9bp1Iza2W2ZlqGqe+dDK6uKmn2g8osYvIeFKS3OcFsXXEcyl7MjixA7wQuKwWU/sBEZGGlCT3PwJH8tQpnCeJk/ucDtvImqjtB1Sti8h4VZLcW/dIbbc+M/ZZmfiKINY/LTN37tRpjz5asCciIlKkJLm/gMHJfWZm7HaZeLRWvn9aZvnytROuclfVLiLjWclqmYeDWO6XwtJM/J+D2IRdCqnELiLjXUnlvlsQy9XZuTn3jv3ce/UiJhGRsVJSuUdyyX1KJh5d9DQhK3dV7SIyEZQk99wNOCL1e622RCdU1ThMRKQhJcn9WGBTetz6nqtft8nEtw1iE65yV9UuIhNFyZz75xj4JdD67sRTMxuibbr7ecHYCTXnrsQuIhNJSeW+8xBed0MUNLOTg/CEqNzVM0ZEJqKS5P5bBqZj6upp79ZwFLw3iE2IxmEiIhNRybTMGxj8S2Aj1QqY9qkZB47ObOO4Tm8wHtsPKLGLyERWUrkvDWJR33Yjf0J19yDWPy0zd+7UMa/QVbGLSC8pSe7R2c5cm4F658eWdwWx/mmZVvuB8fIlIjLRlST324ewva9n4i8xsxfXYuPyhKqqdhHpBSXJ/QVBbF1m7Ccy8VuAQ2sxXcQkItKQkuQeza/n5ta3z8SfweCbZI+7yl1Vu4j0ipLVMnOD2JPECX6nzDZucvef1mIT6iImEZGJpKRyj5p+rcmM/TZxa4LpQWxcVe6q2kWklwz3hGpfZuxHqNbA10X3VdWcu4hIQ0qS+z5BLFdqn0c81XN5EBs3lbuqdhHpNSXJ/f62x62qfHZm7HMz8ZOC2LhpPyAi0mtKknv7idN6d8i6aGUNwHwzO6wW66/c+/qm6oIlEZEuKlktM4WBFr+t7yuIb8yxMLONJxl8orV/tcy0aWtZvbpgT0REpEhJ
cp/OQIOwVsUe3Xyjkxvd/Q+5J8eqcZimZESkV5VMy+zFQNXd+h6tiAG4LhOfb2YH1mLj5oSqiEivKUnu72agcm9935AZOzMTvwt4eS2mpZAiIg0pSe7vDGK5aZmHMvEFwG212JhW7pqSEZFeVjLnHo3JzZDXm4O1XOLul9Ziaj8gItKQksr9wSFsLzcXvySIjVnlrqpdRHpdSXJ/hMHLGHPpcVUmHvWn0Zy7iEhDSpL7TAZPw+SmZXbMxKMWBlotIyLSkJLknrt1XuQq4qmZ7wSxMWs/ICLS60qSe669b+T1xFMwUUU/Ju0HREQmg5LkXr/3KeSnZXLxk4PYmFTuIiKTQclSyA3AJp76i6DVY6bud5lt3NfpDUar/YCSu4hMFiWV+w4MTuSbMmNflok/EcR0QlVEpCElyf0u8tMtdadm4tcHMS2FFBFpSElyP2AIrzuCuKqPfjmochcRaUjJnPu6ILaROMHvQpzIFwSxUW0/oPl2EZlMSir3g4JYtNwRqpty1JO7u/t/BGNVuYuINKQkuX87iNU7PLZE91Y1Mzs3iGvOXUSkISXJ/SvA+vS4NZ8+PzP2N5l4NLUzapW7pmREZLIpmXP/IbB1etz6ZbB1ZuzhmfiNQUz3UBURaUhJ5R7VvbmbdfwpE39aEBu19gMiIpNNSXJ/O4ObgeUuYsp1hbwmiI1a+wERkcmmJLmfx8DqmFaqzNXDOxBX+p82s51zb9BqP6CqXUSkO0qS+5y2x610mbspxxzixH+Cu9fv6NQ/LTN37lRV7SIiXVSS3BcHsVmZsXdl4kcFsf5pmeXL16pyFxHpopLkvhUDUy0b0vdc2vx1Jh7N0esiJhGRhpQshZzPQDJvjc+1/N0zs42o0h/V9gMiIpNJSeV+QxC7NjP2bzLxPYJY45W75txFZLIqSe6L2h63plfmRAOJV8oAHBbE1H5ARKQhJcn9n4Lxew/xffYxs3rr4EYrd1XtIjKZlST3qNXAyszYdeSr9+NrP6tyFxFpSElyvymIta+gafcn8itpdq39rNUyIiINKUnuUUuBbYiT+I3Eyx43BLFG2w+IiExmJcn9wiD2eGbsM4iT+xpgWe4Nut1+QERksitJ7lG73msYXI2vBL5IPF0zC3hBLdZY+wERkcmu5CKmQ4LYPgyu0GdQ9X6PPOLur6zF+i9iarUf6BYleBGZ7Eoq97cGsX2AKbWYUd2JKVpdc0cmphOqIiINKKncozo4V2dHt9OD+J6raj8gItKQkso9urtSLolfQfzLYEzaD4iITFYllfsuQSxa2giwP3FV/5cgpspdRKQhJZX7rUEsd5u9/Ygr93uCWCOVu06mioiUJfdovvwzmbE3Eyf395nZe2oxtR8QEWlISXLfLYj9fWbsOZltOvCDWkxz7iIiDSmZcz8yiM3PjP1IJn6Xu99di/XPuU+btpbVqwv2REREipQk94eA7WuxXMWfK8FzN/cABtoPdIPm3EVEht84bE1m7FWZ+GVBrJH2AyIiUpbc/5HBfdqj1S8AB2biM4NY/wnVVvsBNQwTEemOkuT++jSuPX3ulBmbu4dqlNx1QlVEpCElyf1yBveLWTzE97k/iGkppIhIQ0qS+6uD2F6Zsfdl4i8PYqrcRUQaUpLclwaxhzJjcyX494OYKncRkYaUJPeDg9hzMmOjPjQAzw5iqtxFRBpSktwvCWLLM2O3JG4/cKaZ7VyLqXIXEWlISXJ/SRDbITP2euLkfre7P1iLdbVy1xp3EZEBJVeoHhHE6ndhaplD/Asjeh+1HxARaUhJ5X5lEFuUGRvd2AOgL4j1V+59fVN1AZOISBeVVO5PC2IHZca+NBNfGsRUuYuINKSkcn8iiK3PjM3V0Ht2eoNW4zBV7iIi3VGS3KPWAdtkxkZXogLcEsS0FFJEpCElyT26GXauVq6viGmZZmb1DK6lkCIiDSlJ7tGa9lxy3z8TPwI4sRbrWuWuZZAiIk9VckI1auP7JPFyyNwvi+kMnrLp
P6Gqyl1EpLtKKvebGHxh0sbM2OmZ+Bp3/3ktpjl3EZGGlCT3qNNjrtReS5z4HzezN9VimnMXEWlISXJ/EYPn2GdkxvZltrktcGwtpspdRKQhJcn9uiAWrX0HWAlsCuIOPFqL9Vfu06at1X1TRUS6qCS5r6c6gVqPRX5N1Rmybg5wZi3WtfYDIiLyVCWrZQ5g8MqY3C+FP2fi57v7I7WY2g+IiDSkJLlHK2AeJL5Jdq63TNSfpl+r/cBwaFpGRGSw4a6WyZ0B3S8T/3IQ65+WmTt3qubbRUS6qCS5R5X77pmxF2fiZ5vZMbVY/wnV5cvXar5dRKSLSpL7giCWq9yjuzYBvNXdf1KLaSmkiEhDSpJ7yZiW04iXQkbZWxcxiYg0pCRxLyG+L2rkFZltRsldlbuISENKVsvszeArVHOz3dtR/SKoP78iGKvGYSIiDSmp3DcEsSeJq/n5xIn/6CCmyl1EpCElyf2vQayPOInPIE76ewQxzbmLiDSkJLlHfdsfz4x9FXHSf76Z7VOLjbhy1zp3EZFYSXKPGgP8PjP2U8Qtf6cDr67FRtw4TEREYiXJPWodcHxm7A+JG4dtQ7xeHhhoP6ALmEREuqMkuUeV+FTiufV/zWzj7mA7I24/ICIisZKlkMsZ3CRsY/BaBw7NbGMBgztG9i+FbLUfGColeBGRWEnlfm0QWxXEjOpmHZGZwPm1mJZCiog0pCS5XxbEtsmMnZWJP+zu9RU2WgopItKQkuRevwsTxMsjobqVXjRH//0gpspdRKQhJXPuXwpi0YoYgF2If2EsC2JqPyAi0pCSyn1dF97ndUFMlbuISENKkvtjQSxq69spviSIjWjOXStlRETySpL7/w5i6zNjc20JfhDEVLmLiDSkJLmfHsRyc/UPE1/c9N4gNqL2AyIikleS3D/H4Ep9JXES3594tcy+nd5gOO0HREQkryS5Pw/YuhYzBnd/dKpWwFFVH83Fj6j9gIiI5JUshTwiiM0MYkZ18nVG8NydQWxE7QeU4EVE8oa7FPKhzNgfE0/XfDyI6YSqiEhDSpL7zkFsTmbsHZn4x4KY2g+IiDSkJLlfE8S2Iq7Qn8jEdwliqtxFRBpSMud+8BBetxPxL4xHgpjaD4iINKSkcs+18Y0q9L7M2D8FsWFX7jqZKiLSWUly/14QW0d8I+zbiJP+hiCmOXcRkYaUJPffBLHoZh0Qr4oBuDyIac5dRKQhJXPunwpiuZt1PEJc0Uc38eifc582bS2rVxfsiYiIFCmp3PcKYtMyY6MLmGDwPVihrXLv65uq1gMiIl1Uktw/EMRyk+TRkkeI/0IYduMwERHprGRa5ti2x5vo/Athj0z8GZ3eoNU4rISSu4jI5pVU7u0dHVvjS34ptJsXxHRCVUSkISXJPboAqd4lsiV3S77lQUxLIUVEGlKS3KN2vblJlD7ide7PCmKq3EVEGlKS3KOrS1d02F70y2B2EFPlLiLSkJLkflwQuy4zdnpmm1FMlbuISENKK/d6NT6lw/hoymZZEFPlLiLSkJLkvncw7pDM2Pq9VluuDGKq3EVEGlKS3L8ZxHKrZR7MxKPOksO6iElERDavJLm/NIjlVsvskIl3vFlHafsBEREpM9zb7OXkGoo928wOr8WGXLmLiEiZkuR+fxCLljsC3JeJHwbcmnuDVvsBVe4iIt1RktwfLowB3J3biLs/Xgv1T8vMnTtVlbuISBeVJPfDgtjPMmNzJ1q/bGbTa7H+aZnly9eqchcR6aKS5B4tQj8e2BjEc9M1M4D312JaCiki0pCS5B41/VqWee2OmW38FDi4FtNFTCIiDRnuapn9iJdDziVuHHYog3vUqHIXEWlISXK/egivq580bdkN+EQtNqTKXSdURUTKlST3Zw/hdeuI5+IvcPfHajFV7iIiDSlJ7kuGsL2tiO/SFPWcGdJFTCIiUq4kuddPhEL+jku5VsDPDGJDaj8gIiLlSpL7vUEst559VSb+iyCmyl1EpCEl
N7reL4jlaukjMvG+Tm/Qaj/QiRK8iEi5ksp9TRDLVeiXZuIbgtiQ2g+IiEi5ksp9RmEM8msao+Tempah1X6gEyV4EZFyJZX774NYtNwR4IJM/IYgpqWQIiINKUnurwXOrsVyd1x6GfEVqm8OYsUXMalqFxEZmpLk/j3gjFrsu5mxRxCfbO14JyZV7iIi3VUy515v1Qvw3MzYV2bifw1i/XPuahwmItJdw72I6ajM2O9l4k8EMVXuIiINKUnu0Rr16NZ7kK/oDw1iavkrItKQkuT+QBDLLYVckIlH0z+q3EVEGlKS3K9icOOvZZmx/56J13u5wxDaD4iIyNCUJPfbGXwR0qzM2Isz8egipn6t9gNqGCYi0h0lyf0jwLa12PwOYyPRL4Oi9gMiIjJ0Jcn9ySCWq8T3JL6IKWr52z8t02o/oMpdRKQ7SpJ7e9XemnvPvW4+8UVMi4OYTqiKiDSk5CKm9mTd6uOeS+65ij5aRaOLmEREGlJSuUd3XdqCePrltsw2optsb7Zy15y7iMjwlCT33wWx9cTTL3cTJ/3dgljHi5iU2EVEhq8kuX+TgZOqm9L31ZmxCzPxqDTXnLuISENKkvvngSm18bnb5r2TuKKP5vazlbuqdhGRkSlJ7lHTr+jE6SbgA5lt3JGJqXIXEWlASXLfM4jNzmwr6iAJcGUQy7YfEBGRkSlJ7pFtMvFcCf7uINZfuff1TdVFSyIiXVSS3B8NYrnGYdGySYj7v4eVu4iIjFxJco9Ohi4kXvK4KLONlZ3eoL1xmIiIjFxJcr8giC3NjN2UiZ8exHRCVUSkISXtB54XxLYjXvK4b2YbGzNj1X5ARKQBJZX7H4LYbzJj662BW1YEsUGVu+bcRUS6oyS5vyqIRf3ZNwH/ltnG24KY7qEqItKQkuQe3Zjj8My23pDZxtwgpjl3EZGGDPcK1Zx72h63n1y9Kxiryl1EpCElyT1axriEKnk7Tz1ZehkDSyTbtx2tovk6cD1w/e67z9N8u4hIF5Uk9zVBbB7wCE8tuTcAP2MguT+ZHjvBrfrc/avufoi7H7LDDjsMaadFRKSzkuT+YNvjVrJ+K7CKqg3Blim2wt0fY2BlzCYGGozN7MreiohIkZLk/jLg28AXgPcCD7v7T4Adqda6r6O6qOmANP4dVIl9C6qq/zbg2q7utYiIdFRyEdMLgZOBm6iS9gNmdgzwEFXl3uo9cxZwqrtfaGZbUbX/deAn7v7jru+5iIhkbTa5u/tvia9G/UmH15wHnDeC/RIRkREYbstfEREZx5TcRUR6kJK7iEgPUnIXEelB5uPg0lAzWwncPtb7McpaF4JNNpPxuHXMk8NYHPMe7h5eBVqyFHI03O7uh4z1TowmM7t2sh0zTM7j1jFPDuPtmDUtIyLSg5TcRUR60HhJ7l8d6x0YA5PxmGFyHreOeXIYV8c8Lk6oiohId42Xyl1ERLpIyV1EpAc1ktzN7OVmdruZ3Wlm7w+e38bMvpue/4OZLWx77gMpfruZ/W3pNsdaQ8e81MxuMrMbzGzctU0e7jGb2fZm9kszW2VmX6q95rnpmO80s38zs6hp3Zhp6Jh/lbZ5Q/racXSOpswIjvm/mdmi9HkuMrO/aXtNr37OnY55dD9nd+/qF9XNO5YAewFTgMXAAbUxpwFfSY9PBL6bHh+Qxm8D7Jm2s2XJNsfyq4ljTs8tBeaN9fE1cMzTgSOAU4Ev1V5zDXAYVSfSy4BXjPWxjsIx/wo4ZKyPr4Fjfg6wS3p8ILBsEnzOnY55VD/nJir3Q4E73f0ud38SuBA4vjbmeOCb6fHFwEvTb+7jgQvdfZ27/wW4M22vZJtjqYljHu+GfczuvtqrVtJPuTO6mc0HZrn77736v+FbwAmNHsXQdP2YJ4CRHPP17n5/it8CbJsq3l7+nMNjHpW9rmkiue8K3Nv2830pFo5x9w3AE8D2HV5bss2x1MQxQ3Wzk5+nP+/e3sB+j8RIjrnTNu/bzDbHUhPH3PKN
9Kf6h8bZFEW3jvk1wHXuvo7J8zm3H3PLqH3O46X9gMSOcPdlaW7ucjO7zd1/PdY7JV33xvQ5zwS+T3Xns2+N8T51jZk9A/g01S07J4XMMY/q59xE5b4MWND2824pFo6x6pZ821Hdri/32pJtjqUmjhl3b33/K/ADxtd0zUiOudM2d9vMNsdSE8fc/jmvBM6nhz5nM9uN6r/dN7v7krbxPfs5Z4551D/nJpL7H4F9zWxPM5tCdbLhR7UxPwLekh6/Frgizb39CDgxzcvtCexLdeKlZJtjqevHbGbT0294zGw6VQVw8ygcS6mRHHPI3R8AVpjZYelP1jcDP+z+rg9b14/ZzLYys3np8dbAcfTI52xms4EfA+9396tag3v5c84d85h8zg2dbT4G+DPVGecPpthHgVelx1OBi6hOHl4D7NX22g+m191O2xn0aJvj6avbx0x1pn5x+rqlB495KbAcWEU1p3lAih9C9R/9EuBLpKuox8tXt4+ZahXNIuDG9Dl/gbRaarx8DfeYgTOA1cANbV879vLnnDvmsfic1X5ARKQH6QpVEZEepOQuItKDlNxFRHqQkruISA9SchcR6UFK7tIoM9uYLre+2cz+K60D7jT+TDM7fTNjTjCzA9p+/qiZHd2FfT3XzF470u0M8T3fY2bTRvM9ZXJQcpemrXH3g9z9QKo13u/swjZPoFojDoC7f9jdf9GF7Y4qM9sSeA+g5C5dp+Quo+lqUgMmM9vbzH6amqL9xsyeVh9sZv9gZn80s8Vm9n0zm2ZmLwBeBfxr+otg71bFbVUP7ovaXn+UmV2aHr/MzK42s+vM7CIzm9FpR63qpf/J9B7XmtnBZvYzM1tiZqe2bf/XZvZjq/p0f8XMtkjPnWRVT++bzezTbdtdZWafNbPFVBev7QL80sx+mZ7/cnq/W8zsrNr+nJX2/6bWv5eZzTCzb6TYjWb2muEcr/Sgsb4STF+9/QWsSt+3pLqi7+Xp5/8H7JseP5/q8m2AM4HT0+Pt27ZzNvCu9Phc4LVtz51LdQn4VsA9wPQU/zLwJmAe8Ou2+P8EPhzsa/92qa4mfUd6/HmqKwtnAjsAD6X4UVQtfPdKx3d52o9d0n7skPbpCuCE9BoHXtf2nktp69kPzJnYssMAAAJOSURBVG379/oV8Ky2ca3jPw34Wnr8aeD/tL1+Tunx6qu3v9QVUpq2rZndQFWx30rV3XIG8ALgIhvoehr1vD7QzM4GZgMzgJ91eiN332BmPwVeaWYXA8cC7wOOpJrGuSq93xSqvyI2p9VP5CZghlcNn1aa2bq2cwfXuPtdAGZ2AdUNOdYDv3L3h1P8O8CLgUuAjVQdAXNeZ1V7562A+Wm/b0zP/Wf6vgh4dXp8NFXvk9a/wWNmdtwwj1d6iJK7NG2Nux+UThr+jGrO/VzgcXc/aDOvPZeq4l1sZqdQVcqbcyHwj1Tz+9e6+8rUnOpydz9piPve6sO9qe1x6+fW/zv1/h2b6+ex1t03Rk9Y1TjudOB5KUmfS9XDpL4/G+n8/+5wj1d6iObcZVS4ex/wbuCfgT7gL2b2dwBWeXbwspnAA6mL3hvb4ivTc5ErgYOBf6BK9AC/B15oZvuk95tuZvuN8JBaDk3dA7cAXg/8lqqR1JFmNi+dND0p7Vek/VhmUTWdesLMdgJeUfD+l9N2ktrM5tDs8coEoeQuo8bdr6eaYjiJKlm/NZ1YvIX4tokfAv4AXAXc1ha/EPgXM7vezPauvcdG4FKqxHhpij0MnAJcYGY3Uk1RDDqBO0x/pOpqeCvwF+AHXrW0fT/wS6qunovcPdfS9qvAT83sl+6+GLie6ljPpzruzTkbmJNO3C4GXtLw8coEoa6QIsNkZkdRnfw9bqz3RaROlbuISA9S5S4i0oNUuYuI9CAldxGRHqTkLiLSg5TcRUR6kJK7iEgP+v8NRslOf7LLkwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "importances = model_2.feature_importances_\n",
    "indices = np.argsort(importances)  # ascending: most important features are last\n",
    "\n",
    "# Tick labels have to be feature names, not rows of the feature matrix (train_features[indices]).\n",
    "# Columns beyond the TF-IDF vocabulary have no name available here and get placeholders.\n",
    "# NOTE(review): assumes the TF-IDF columns come first in train_features - confirm against the\n",
    "# cell that builds train_features.\n",
    "n_extra = len(importances) - len(text_feature_names)\n",
    "feature_labels = np.array(list(text_feature_names) + ['non_text_%d' % i for i in range(n_extra)])\n",
    "\n",
    "# Draw only the most important features so that the tick labels stay readable.\n",
    "TOP_N = 20\n",
    "top_indices = indices[-TOP_N:]\n",
    "\n",
    "plt.figure(1)\n",
    "plt.title('Feature Importances')\n",
    "plt.barh(range(len(top_indices)), importances[top_indices], color='b', align='center')\n",
    "plt.yticks(range(len(top_indices)), feature_labels[top_indices])\n",
    "plt.xlabel('Relative Importance')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map column index -> importance. Built directly from the model, so features that share the\n",
    "# same importance value (for example several at 0.0) are all kept, and re-running the cell\n",
    "# always produces the same mapping.\n",
    "importances = dict(enumerate(model_2.feature_importances_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invert the term -> column index mapping of tfidf_text into column index -> term.\n",
    "text_dictn = tfidf_text.vocabulary_\n",
    "feature_dict = {column: term for term, column in text_dictn.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mergeDict(dict1, dict2):\n",
    "    '''Merge two dictionaries, pairing up the values of keys present in both.\n",
    "\n",
    "    A key found in only one input keeps its single value. A key found in both\n",
    "    maps to the two-element list [dict2 value, dict1 value]. Neither input is modified.\n",
    "    '''\n",
    "    merged = dict(dict1)\n",
    "    for key, value in dict2.items():\n",
    "        merged[key] = [value, dict1[key]] if key in dict1 else value\n",
    "    return merged"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine the per-column importances with the column -> term mapping: a column present in both\n",
    "# maps to [term, importance]; a column found in only one mapping keeps its single value.\n",
    "final_dict=mergeDict(importances, feature_dict)\n",
    "#final_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the columns for which both a term and an importance are known, i.e. the entries\n",
    "# that mergeDict turned into [term, importance] pairs. The excluded keys are derived from the\n",
    "# data instead of hard-coding the non-text column indices (previously [500, 501, 502]), so the\n",
    "# cell keeps working if the vocabulary size changes.\n",
    "rem_list = [key for key in final_dict\n",
    "            if not (key in feature_dict and key in importances)]\n",
    "excluded = set(rem_list)\n",
    "res = {key: val for key, val in final_dict.items() if key not in excluded}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rank the (column, [term, importance]) entries from most to least important.\n",
    "listofTuples = list(res.items())\n",
    "listofTuples.sort(key=lambda entry: entry[1][1], reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(335, ['php', 0.026087815527021108]),\n",
       " (14, ['android', 0.023758250673924706]),\n",
       " (224, ['java', 0.0213241506634514]),\n",
       " (363, ['python', 0.02063064279299344]),\n",
       " (227, ['javascript', 0.014087009775242973]),\n",
       " (230, ['jqueri', 0.010335043087206914]),\n",
       " (59, ['code', 0.010068408000823441]),\n",
       " (154, ['function', 0.009624995900247322]),\n",
       " (479, ['var', 0.009036939274172198]),\n",
       " (62, ['code pre', 0.008160413291616679])]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First ten entries of the ranked list.\n",
    "listofTuples[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the ranked feature list with pickle (the file content is binary despite the .txt name).\n",
    "with open(\"feature_importance.txt\", \"wb\") as importance_file:\n",
    "    pickle.dump(listofTuples, importance_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the fitted random forest. The context manager closes the file handle, which a bare\n",
    "# open(...) passed straight into pickle.dump never does.\n",
    "with open('RF_model', 'wb') as model_file:\n",
    "    pickle.dump(model_2, model_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Applying XGBoost to the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluation sets reported during training: the training split first, then the validation split.\n",
    "eval_set = [(train_features, y_train), (val_features, y_val)]\n",
    "\n",
    "# n_jobs / random_state are the scikit-learn style names that replace the deprecated\n",
    "# nthread / seed aliases; the values are unchanged.\n",
    "model_3 = XGBClassifier(\n",
    "    learning_rate=0.1,\n",
    "    n_estimators=100,\n",
    "    max_depth=3,\n",
    "    n_jobs=4,\n",
    "    random_state=27,\n",
    "    objective=\"multi:softmax\",\n",
    "    num_class=20,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): disabled sketch of a grid search over n_estimators. The kfold line was truncated\n",
    "# (it had run together with the GridSearchCV call), so it is reconstructed below; the split\n",
    "# settings and the estimator argument are placeholders to confirm. StratifiedKFold does not\n",
    "# appear among the notebook's imports and would need importing before this is enabled.\n",
    "# X / label_encoded_y presumably correspond to train_features / y_train - verify.\n",
    "# n_estimators = range(50, 400, 50)\n",
    "# param_grid = dict(n_estimators=n_estimators)\n",
    "# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=27)\n",
    "# grid_search = GridSearchCV(model_3, param_grid, scoring=\"neg_log_loss\", n_jobs=-1, cv=kfold)\n",
    "# result = grid_search.fit(X, label_encoded_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\tvalidation_0-merror:0.44607\tvalidation_1-merror:0.44070\n",
      "[1]\tvalidation_0-merror:0.44089\tvalidation_1-merror:0.43450\n",
      "[2]\tvalidation_0-merror:0.44161\tvalidation_1-merror:0.43730\n",
      "[3]\tvalidation_0-merror:0.43979\tvalidation_1-merror:0.43360\n",
      "[4]\tvalidation_0-merror:0.43912\tvalidation_1-merror:0.43500\n",
      "[5]\tvalidation_0-merror:0.43662\tvalidation_1-merror:0.43300\n",
      "[6]\tvalidation_0-merror:0.43593\tvalidation_1-merror:0.43190\n",
      "[7]\tvalidation_0-merror:0.43461\tvalidation_1-merror:0.42990\n",
      "[8]\tvalidation_0-merror:0.43296\tvalidation_1-merror:0.42960\n",
      "[9]\tvalidation_0-merror:0.43066\tvalidation_1-merror:0.42810\n",
      "[10]\tvalidation_0-merror:0.42940\tvalidation_1-merror:0.42690\n",
      "[11]\tvalidation_0-merror:0.42745\tvalidation_1-merror:0.42500\n",
      "[12]\tvalidation_0-merror:0.42559\tvalidation_1-merror:0.42420\n",
      "[13]\tvalidation_0-merror:0.42490\tvalidation_1-merror:0.42390\n",
      "[14]\tvalidation_0-merror:0.42281\tvalidation_1-merror:0.42080\n",
      "[15]\tvalidation_0-merror:0.42265\tvalidation_1-merror:0.42140\n",
      "[16]\tvalidation_0-merror:0.42179\tvalidation_1-merror:0.42020\n",
      "[17]\tvalidation_0-merror:0.42014\tvalidation_1-merror:0.41850\n",
      "[18]\tvalidation_0-merror:0.41960\tvalidation_1-merror:0.41790\n",
      "[19]\tvalidation_0-merror:0.41906\tvalidation_1-merror:0.41840\n",
      "[20]\tvalidation_0-merror:0.41811\tvalidation_1-merror:0.41800\n",
      "[21]\tvalidation_0-merror:0.41707\tvalidation_1-merror:0.41840\n",
      "[22]\tvalidation_0-merror:0.41733\tvalidation_1-merror:0.41680\n",
      "[23]\tvalidation_0-merror:0.41691\tvalidation_1-merror:0.41720\n",
      "[24]\tvalidation_0-merror:0.41640\tvalidation_1-merror:0.41670\n",
      "[25]\tvalidation_0-merror:0.41551\tvalidation_1-merror:0.41540\n",
      "[26]\tvalidation_0-merror:0.41480\tvalidation_1-merror:0.41500\n",
      "[27]\tvalidation_0-merror:0.41391\tvalidation_1-merror:0.41470\n",
      "[28]\tvalidation_0-merror:0.41351\tvalidation_1-merror:0.41390\n",
      "[29]\tvalidation_0-merror:0.41255\tvalidation_1-merror:0.41300\n",
      "[30]\tvalidation_0-merror:0.41199\tvalidation_1-merror:0.41260\n",
      "[31]\tvalidation_0-merror:0.41125\tvalidation_1-merror:0.41300\n",
      "[32]\tvalidation_0-merror:0.41078\tvalidation_1-merror:0.41300\n",
      "[33]\tvalidation_0-merror:0.41046\tvalidation_1-merror:0.41270\n",
      "[34]\tvalidation_0-merror:0.40956\tvalidation_1-merror:0.41080\n",
      "[35]\tvalidation_0-merror:0.40914\tvalidation_1-merror:0.41050\n",
      "[36]\tvalidation_0-merror:0.40840\tvalidation_1-merror:0.41020\n",
      "[37]\tvalidation_0-merror:0.40786\tvalidation_1-merror:0.40980\n",
      "[38]\tvalidation_0-merror:0.40744\tvalidation_1-merror:0.40950\n",
      "[39]\tvalidation_0-merror:0.40686\tvalidation_1-merror:0.40920\n",
      "[40]\tvalidation_0-merror:0.40618\tvalidation_1-merror:0.40920\n",
      "[41]\tvalidation_0-merror:0.40566\tvalidation_1-merror:0.40920\n",
      "[42]\tvalidation_0-merror:0.40525\tvalidation_1-merror:0.40810\n",
      "[43]\tvalidation_0-merror:0.40475\tvalidation_1-merror:0.40780\n",
      "[44]\tvalidation_0-merror:0.40411\tvalidation_1-merror:0.40700\n",
      "[45]\tvalidation_0-merror:0.40368\tvalidation_1-merror:0.40680\n",
      "[46]\tvalidation_0-merror:0.40337\tvalidation_1-merror:0.40730\n",
      "[47]\tvalidation_0-merror:0.40253\tvalidation_1-merror:0.40750\n",
      "[48]\tvalidation_0-merror:0.40206\tvalidation_1-merror:0.40650\n",
      "[49]\tvalidation_0-merror:0.40142\tvalidation_1-merror:0.40610\n",
      "[50]\tvalidation_0-merror:0.40115\tvalidation_1-merror:0.40520\n",
      "[51]\tvalidation_0-merror:0.40050\tvalidation_1-merror:0.40570\n",
      "[52]\tvalidation_0-merror:0.40011\tvalidation_1-merror:0.40560\n",
      "[53]\tvalidation_0-merror:0.39968\tvalidation_1-merror:0.40510\n",
      "[54]\tvalidation_0-merror:0.39945\tvalidation_1-merror:0.40500\n",
      "[55]\tvalidation_0-merror:0.39874\tvalidation_1-merror:0.40460\n",
      "[56]\tvalidation_0-merror:0.39811\tvalidation_1-merror:0.40440\n",
      "[57]\tvalidation_0-merror:0.39751\tvalidation_1-merror:0.40420\n",
      "[58]\tvalidation_0-merror:0.39690\tvalidation_1-merror:0.40370\n",
      "[59]\tvalidation_0-merror:0.39645\tvalidation_1-merror:0.40400\n",
      "[60]\tvalidation_0-merror:0.39615\tvalidation_1-merror:0.40320\n",
      "[61]\tvalidation_0-merror:0.39540\tvalidation_1-merror:0.40320\n",
      "[62]\tvalidation_0-merror:0.39516\tvalidation_1-merror:0.40230\n",
      "[63]\tvalidation_0-merror:0.39467\tvalidation_1-merror:0.40180\n",
      "[64]\tvalidation_0-merror:0.39421\tvalidation_1-merror:0.40190\n",
      "[65]\tvalidation_0-merror:0.39387\tvalidation_1-merror:0.40150\n",
      "[66]\tvalidation_0-merror:0.39322\tvalidation_1-merror:0.40100\n",
      "[67]\tvalidation_0-merror:0.39263\tvalidation_1-merror:0.40160\n",
      "[68]\tvalidation_0-merror:0.39211\tvalidation_1-merror:0.40150\n",
      "[69]\tvalidation_0-merror:0.39185\tvalidation_1-merror:0.40080\n",
      "[70]\tvalidation_0-merror:0.39117\tvalidation_1-merror:0.40040\n",
      "[71]\tvalidation_0-merror:0.39075\tvalidation_1-merror:0.40000\n",
      "[72]\tvalidation_0-merror:0.39036\tvalidation_1-merror:0.39940\n",
      "[73]\tvalidation_0-merror:0.38967\tvalidation_1-merror:0.39970\n",
      "[74]\tvalidation_0-merror:0.38960\tvalidation_1-merror:0.39970\n",
      "[75]\tvalidation_0-merror:0.38898\tvalidation_1-merror:0.39970\n",
      "[76]\tvalidation_0-merror:0.38851\tvalidation_1-merror:0.39880\n",
      "[77]\tvalidation_0-merror:0.38831\tvalidation_1-merror:0.39930\n",
      "[78]\tvalidation_0-merror:0.38774\tvalidation_1-merror:0.39860\n",
      "[79]\tvalidation_0-merror:0.38739\tvalidation_1-merror:0.39850\n",
      "[80]\tvalidation_0-merror:0.38703\tvalidation_1-merror:0.39860\n",
      "[81]\tvalidation_0-merror:0.38682\tvalidation_1-merror:0.39870\n",
      "[82]\tvalidation_0-merror:0.38638\tvalidation_1-merror:0.39860\n",
      "[83]\tvalidation_0-merror:0.38594\tvalidation_1-merror:0.39900\n",
      "[84]\tvalidation_0-merror:0.38555\tvalidation_1-merror:0.39800\n",
      "[85]\tvalidation_0-merror:0.38527\tvalidation_1-merror:0.39810\n",
      "[86]\tvalidation_0-merror:0.38504\tvalidation_1-merror:0.39730\n",
      "[87]\tvalidation_0-merror:0.38453\tvalidation_1-merror:0.39780\n",
      "[88]\tvalidation_0-merror:0.38404\tvalidation_1-merror:0.39690\n",
      "[89]\tvalidation_0-merror:0.38371\tvalidation_1-merror:0.39630\n",
      "[90]\tvalidation_0-merror:0.38327\tvalidation_1-merror:0.39590\n",
      "[91]\tvalidation_0-merror:0.38284\tvalidation_1-merror:0.39650\n",
      "[92]\tvalidation_0-merror:0.38272\tvalidation_1-merror:0.39670\n",
      "[93]\tvalidation_0-merror:0.38274\tvalidation_1-merror:0.39640\n",
      "[94]\tvalidation_0-merror:0.38207\tvalidation_1-merror:0.39630\n",
      "[95]\tvalidation_0-merror:0.38169\tvalidation_1-merror:0.39590\n",
      "[96]\tvalidation_0-merror:0.38144\tvalidation_1-merror:0.39610\n",
      "[97]\tvalidation_0-merror:0.38124\tvalidation_1-merror:0.39580\n",
      "[98]\tvalidation_0-merror:0.38104\tvalidation_1-merror:0.39540\n",
      "[99]\tvalidation_0-merror:0.38066\tvalidation_1-merror:0.39570\n"
     ]
    }
   ],
   "source": [
    "# Fit the XGBoost model. eval_set makes every boosting round log merror for its two entries,\n",
    "# shown in the output as validation_0 and validation_1.\n",
    "model_3.fit(train_features,y_train, eval_set=eval_set)\n",
    "# Predictions for the test split. This rebinds y_pred, which the random forest cell also assigns.\n",
    "y_pred = model_3.predict(test_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-class metrics for the current predictions, labelled with the encoder's class names.\n",
    "xgb_report = classification_report(\n",
    "    y_test,\n",
    "    y_pred,\n",
    "    target_names=label_encoder.classes_,\n",
    ")\n",
    "print(xgb_report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Confusion matrix of the current predictions against the test labels.\n",
    "xgb_conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)\n",
    "print(xgb_conf_mat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
